This notebook covers the machine-learning process used to analyse and predict the weekly sales in the Walmart train.csv and test.csv datasets.
The method used for prediction is Random Forest regression, which achieves an R² score of about 99% on the training data.
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import seaborn as sns
from datetime import datetime
import statsmodels.api as sm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Load the raw Walmart competition datasets.
train = pd.read_csv('data/train.csv')
feature = pd.read_csv('data/features.csv')
test = pd.read_csv('data/test.csv')
stores = pd.read_csv('data/stores.csv')
# Workbook that collects the summary/correlation sheets written below.
writer = pd.ExcelWriter('Walmart Store Sales Prediction output.xlsx', engine='xlsxwriter')
Merge train with stores and features; merge test with stores and features.
# Enrich both sales tables with store metadata and the external features
# (merges key on the shared columns automatically).
train = pd.merge(pd.merge(train, stores), feature)
test = pd.merge(pd.merge(test, stores), feature)
train.head(2)
test.head(2)
# Partition the training columns by dtype using select_dtypes instead of
# scanning dict(train.dtypes) by hand — same column order, clearer intent.
numeric_var_train = train.select_dtypes(include=['float64', 'int64', 'float32', 'int32']).columns.tolist()
cat_var_train = train.select_dtypes(include=['object']).columns.tolist()
# Train numerical data
train_num = train[numeric_var_train]
# Train categorical data
train_cat = train[cat_var_train]
print(numeric_var_train)
print(cat_var_train)
# Creating Data audit Report
# Use a general function that returns multiple values
def var_summary(x):
    """Return a data-audit Series for one numeric column.

    Includes count, missing count, sum, mean, median, std, variance,
    min/max and the 1/5/10/25/50/75/90/95/99 percentiles.
    Series.quantile already ignores NaN, so the original per-percentile
    ``dropna()`` calls were redundant; one vectorized call computes all nine.
    """
    pct = x.quantile([0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99])
    return pd.Series(
        [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
         x.std(), x.var(), x.min(), *pct.tolist(), x.max()],
        index=['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
               'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99', 'MAX'])
# Audit report for the numeric training columns (one row per column).
num_summary = train_num.apply(var_summary).T
num_summary.to_excel(writer, 'Numeric_variable Summary', index=True)
num_summary
def cat_summary(x):
    """Summarise one categorical column: count, missing count and value frequencies."""
    stats = [x.count(), x.isnull().sum(), x.value_counts()]
    return pd.Series(stats, index=['N', 'NMISS', 'ColumnsNames'])
# NOTE: this rebinding shadows the cat_summary function with its result,
# which is why the function is redefined again before the test-set summary.
cat_summary = train_cat.apply(cat_summary)
cat_summary
# Partition the TEST columns by dtype (select_dtypes idiom; the original
# comments wrongly said "Train" here).
numeric_var_test = test.select_dtypes(include=['float64', 'int64', 'float32', 'int32']).columns.tolist()
cat_var_test = test.select_dtypes(include=['object']).columns.tolist()
# Test numerical data
test_num = test[numeric_var_test]
# Test categorical data
test_cat = test[cat_var_test]
print(numeric_var_test)
print(cat_var_test)
# Audit report for the numeric test columns (Excel export intentionally disabled).
num_summary = test_num.apply(var_summary).T
num_summary.head()
def cat_summary(x):
    """Summarise one categorical column (redefined: the earlier result
    assignment shadowed the function name)."""
    return pd.Series(
        [x.count(), x.isnull().sum(), x.value_counts()],
        index=['N', 'NMISS', 'ColumnsNames'],
    )
# Categorical audit for the test set (again shadows the helper with its result).
cat_summary = test_cat.apply(cat_summary)
cat_summary
# Correlation matrices. Restrict to numeric/bool columns explicitly:
# modern pandas no longer silently drops object columns ('Type', 'Date'
# are still strings at this point) and would raise from a bare .corr().
# .corr() already returns a DataFrame, so the pd.DataFrame wrap was redundant.
train_corr = train.select_dtypes(include=['number', 'bool']).corr()
train_corr.to_excel(writer, 'Train_Data Corr', index=True)
train_corr.head()
test_corr = test.select_dtypes(include=['number', 'bool']).corr()
# test_corr.to_excel(writer, 'Test_Data Corr', index=True)
test_corr.head()
# Visualize the correlation matrices as Seaborn heatmaps. Numeric/bool
# columns only — a bare .corr() raises on the string columns in modern pandas.
sns.heatmap(train.select_dtypes(include=['number', 'bool']).corr())
sns.heatmap(test.select_dtypes(include=['number', 'bool']).corr())
# Share of records per store. The original passed fig=(4,5), which is not a
# valid plot kwarg — figsize is the intended argument.
train['Store'].value_counts(normalize=True).plot(kind='bar', figsize=(4, 5))
This graph shows that Store 13 has the most records and Store 36 has very few.
# Weekly sales against store number.
train.plot.line(x='Weekly_Sales', y='Store', alpha=0.5)
Store 10 has the highest sales.
# Mean weekly sales per store type.
sns.barplot(x="Weekly_Sales", y="Type", data=train)
Type A stores have higher sales than Type C stores.
# Weekly sales per department. The original used alpha=1.5 (matplotlib
# requires alpha in [0, 1] and raises otherwise) and the invalid kwarg
# fig=(4,5) instead of figsize.
train.plot(kind='line', x='Dept', y='Weekly_Sales', alpha=1.0, figsize=(4, 5))
The departments with the highest sales fall between 60 and 80.
# Per-column missing-value counts for both frames.
train.isna().sum()
test.isna().sum()
Replacing missing data with the mean of its department group.
# Fill missing CPI/Unemployment in the test set with the department-level mean.
for col in ['CPI', 'Unemployment']:
    test[col] = test.groupby('Dept')[col].transform(lambda s: s.fillna(s.mean()))
As far as markdowns are concerned, we can fill missing values with 0, which means no markdown.
# Remaining NaNs (the MarkDown columns) mean "no markdown" — fill with 0.
train = train.fillna(0)
test = test.fillna(0)
train.isnull().sum()
test.isnull().sum()
# Cap extreme weekly sales at 100k (same effect as np.where(ws > 1e5, 1e5, ws)).
train['Weekly_Sales'] = train['Weekly_Sales'].clip(upper=100000)
train['Weekly_Sales'].plot.hist(bins=25)
Selection of features to train the model. Creation of new features from existing features. Conversion of categorical features into numeric form.
train.info()
test.info()
# Parse the Date column and derive calendar features on both frames.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame['Date_dayofweek'] = frame['Date'].dt.dayofweek
    frame['Date_month'] = frame['Date'].dt.month
    frame['Date_year'] = frame['Date'].dt.year
    frame['Date_day'] = frame['Date'].dt.day
train.Type.value_counts()
test.Type.value_counts()
train.IsHoliday.value_counts()
test.IsHoliday.value_counts()
# Pair the frames so the categorical encodings below apply to both at once.
train_test_data = [train, test]
Conversion of the categorical variable 'Type' into a numerical variable
(A = 1, B = 2, C = 3)
# Encode store Type as an ordinal: A -> 1, B -> 2, C -> 3.
type_mapping = {"A": 1, "B": 2, "C": 3}
for frame in train_test_data:
    frame['Type'] = frame['Type'].map(type_mapping)
Conversion of categorical variable 'IsHoliday' into numerical variable
# Encode the IsHoliday boolean flag as 0/1.
type_mapping = {False: 0, True: 1}
for frame in train_test_data:
    frame['IsHoliday'] = frame['IsHoliday'].map(type_mapping)
Creation of holiday variables like Christmas and Thanksgiving. If that week falls on the holiday, then 1 = yes, else 0 = no.
Creation of holiday variables
# Flag the Thanksgiving and Christmas weeks (1 if the record's date matches,
# else 0) — isin over the known holiday dates replaces the chained np.where.
thanksgiving_dates = [datetime(2010, 11, 26), datetime(2011, 11, 25),
                      datetime(2012, 11, 23), datetime(2013, 11, 29)]
christmas_dates = [datetime(2010, 12, 31), datetime(2011, 12, 30),
                   datetime(2012, 12, 28), datetime(2013, 12, 27)]
train['Thanksgiving'] = train['Date'].isin(thanksgiving_dates).astype(int)
train['Christmas'] = train['Date'].isin(christmas_dates).astype(int)
test['Thanksgiving'] = test['Date'].isin(thanksgiving_dates).astype(int)
test['Christmas'] = test['Date'].isin(christmas_dates).astype(int)
train.Christmas.value_counts()
train.Thanksgiving.value_counts()
test.Christmas.value_counts()
test.Thanksgiving.value_counts()
As 'IsHoliday' already covers these weeks, we can drop the extra holiday flags (removal of redundant features).
# IsHoliday already encodes these weeks, so the explicit flags are redundant.
redundant_flags = ['Thanksgiving', 'Christmas']
for frame in (train, test):
    frame.drop(redundant_flags, axis=1, inplace=True)
train.info()
As we have already imputed the MarkDown variables, we will keep them and only remove MarkDown5 because it is highly skewed.
# Remove Unemployment, CPI and the highly skewed MarkDown5 from both frames.
features_drop = ['Unemployment', 'CPI', 'MarkDown5']
train = train.drop(columns=features_drop)
test = test.drop(columns=features_drop)
train.head(2)
test.head(2)
#### train_X = everything except Weekly_Sales
# Feature matrix: everything except the target and the raw Date.
train_X = train.drop(columns=['Weekly_Sales', 'Date'])
#### train Y= Only Weekly_Sales
# Target vector: the weekly sales column.
train_y=train['Weekly_Sales']
# Test features: drop the raw Date and keep an independent copy.
test_X = test.drop(columns='Date').copy()
train_X.shape, train_y.shape, test_X.shape
Random Forest
# Fit a 100-tree random forest on the full training set and predict the test set.
regressor = RandomForestRegressor(n_estimators=100)
regressor.fit(train_X, train_y)
y_pred_rf = regressor.predict(test_X)
# NOTE: .score here is R^2 on the TRAINING data, not held-out accuracy.
acc_rf = round(regressor.score(train_X, train_y) * 100, 2)
print("Accuracy: %i %% \n" % acc_rf)
# One-row leaderboard of the models tried (only Random Forest here).
models = pd.DataFrame({'Model': ['Random Forest'], 'Score': [acc_rf]})
models.sort_values(by='Score', ascending=False)
# Build the submission keyed by Store_Dept_Date and write it to CSV.
row_ids = test.Store.astype(str) + '_' + test.Dept.astype(str) + '_' + test.Date.astype(str)
submission = pd.DataFrame({
    "Store_Dept_Date": row_ids,
    "Weekly_Sales": y_pred_rf,
})
submission.to_csv('weekly_sales predicted.csv', index=False)
submission.head(5)